library(tidyverse)
library(ggplot2)
library(ggcorrplot)
library(corrplot)
library(leaps)
library(car)
library(Metrics)
library(reshape2)
library(ggpubr)
library(moments)
library(naniar)
library(VIM)
library(plotly)
data_input<-read_csv("Life Expectancy Data.csv")
str(data_input)
## spc_tbl_ [2,938 × 22] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ country : chr [1:2938] "Afghanistan" "Afghanistan" "Afghanistan" "Afghanistan" ...
## $ year : num [1:2938] 2015 2014 2013 2012 2011 ...
## $ status : chr [1:2938] "Developing" "Developing" "Developing" "Developing" ...
## $ life_expectancy : num [1:2938] 65 59.9 59.9 59.5 59.2 58.8 58.6 58.1 57.5 57.3 ...
## $ adult_mortality : num [1:2938] 263 271 268 272 275 279 281 287 295 295 ...
## $ infant_mortality : num [1:2938] 62 64 66 69 71 74 77 80 82 84 ...
## $ alcohol : num [1:2938] 0.01 0.01 0.01 0.01 0.01 0.01 0.01 0.03 0.02 0.03 ...
## $ percent_expenditure : num [1:2938] 71.3 73.5 73.2 78.2 7.1 ...
## $ hepatitis_B : num [1:2938] 65 62 64 67 68 66 63 64 63 64 ...
## $ measles : num [1:2938] 1154 492 430 2787 3013 ...
## $ BMI : num [1:2938] 19.1 18.6 18.1 17.6 17.2 16.7 16.2 15.7 15.2 14.7 ...
## $ under_5_mortality : num [1:2938] 83 86 89 93 97 102 106 110 113 116 ...
## $ polio : num [1:2938] 6 58 62 67 68 66 63 64 63 58 ...
## $ total_expenditure : num [1:2938] 8.16 8.18 8.13 8.52 7.87 9.2 9.42 8.33 6.73 7.43 ...
## $ diphtheria : num [1:2938] 65 62 64 67 68 66 63 64 63 58 ...
## $ HIV/AIDS_mortality : num [1:2938] 0.1 0.1 0.1 0.1 0.1 0.1 0.1 0.1 0.1 0.1 ...
## $ GDP : num [1:2938] 584.3 612.7 631.7 670 63.5 ...
## $ population : num [1:2938] 33736494 327582 31731688 3696958 2978599 ...
## $ thinness_1-19 : num [1:2938] 17.2 17.5 17.7 17.9 18.2 18.4 18.6 18.8 19 19.2 ...
## $ thinness_5-9 : num [1:2938] 17.3 17.5 17.7 18 18.2 18.4 18.7 18.9 19.1 19.3 ...
## $ income_composition_of_resources: num [1:2938] 0.479 0.476 0.47 0.463 0.454 0.448 0.434 0.433 0.415 0.405 ...
## $ schooling : num [1:2938] 10.1 10 9.9 9.8 9.5 9.2 8.9 8.7 8.4 8.1 ...
## - attr(*, "spec")=
## .. cols(
## .. country = col_character(),
## .. year = col_double(),
## .. status = col_character(),
## .. life_expectancy = col_double(),
## .. adult_mortality = col_double(),
## .. infant_mortality = col_double(),
## .. alcohol = col_double(),
## .. percent_expenditure = col_double(),
## .. hepatitis_B = col_double(),
## .. measles = col_double(),
## .. BMI = col_double(),
## .. under_5_mortality = col_double(),
## .. polio = col_double(),
## .. total_expenditure = col_double(),
## .. diphtheria = col_double(),
## .. `HIV/AIDS_mortality` = col_double(),
## .. GDP = col_double(),
## .. population = col_double(),
## .. `thinness_1-19` = col_double(),
## .. `thinness_5-9` = col_double(),
## .. income_composition_of_resources = col_double(),
## .. schooling = col_double()
## .. )
## - attr(*, "problems")=<externalptr>
head(data_input)
## # A tibble: 6 × 22
## country year status life_…¹ adult…² infan…³ alcohol perce…⁴ hepat…⁵ measles
## <chr> <dbl> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Afghanis… 2015 Devel… 65 263 62 0.01 71.3 65 1154
## 2 Afghanis… 2014 Devel… 59.9 271 64 0.01 73.5 62 492
## 3 Afghanis… 2013 Devel… 59.9 268 66 0.01 73.2 64 430
## 4 Afghanis… 2012 Devel… 59.5 272 69 0.01 78.2 67 2787
## 5 Afghanis… 2011 Devel… 59.2 275 71 0.01 7.10 68 3013
## 6 Afghanis… 2010 Devel… 58.8 279 74 0.01 79.7 66 1989
## # … with 12 more variables: BMI <dbl>, under_5_mortality <dbl>, polio <dbl>,
## # total_expenditure <dbl>, diphtheria <dbl>, `HIV/AIDS_mortality` <dbl>,
## # GDP <dbl>, population <dbl>, `thinness_1-19` <dbl>, `thinness_5-9` <dbl>,
## # income_composition_of_resources <dbl>, schooling <dbl>, and abbreviated
## # variable names ¹life_expectancy, ²adult_mortality, ³infant_mortality,
## # ⁴percent_expenditure, ⁵hepatitis_B
missing.rows = dim(data_input)[1] - dim(na.omit(data_input))[1]
missings_df <- data.frame(type=c("missing", "non-missing") ,count = c(missing.rows, dim(na.omit(data_input))[1]))
ggplot(missings_df, aes(fill=type, y="", x=count)) +
geom_bar(position="stack", stat="identity")+
ggtitle("Missing vs Non-missing row counts") +
xlab("Missing count") + ylab("")
vis_miss(data_input)
data_fill <- kNN(data_input)
data_fill <- subset(data_fill, select = country:schooling)
vis_miss(data_fill)
data<-data_fill[-c(1,2,3)]
glimpse(data)
## Rows: 2,938
## Columns: 19
## $ life_expectancy <dbl> 65.0, 59.9, 59.9, 59.5, 59.2, 58.8, 58…
## $ adult_mortality <dbl> 263, 271, 268, 272, 275, 279, 281, 287…
## $ infant_mortality <dbl> 62, 64, 66, 69, 71, 74, 77, 80, 82, 84…
## $ alcohol <dbl> 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.…
## $ percent_expenditure <dbl> 71.279624, 73.523582, 73.219243, 78.18…
## $ hepatitis_B <dbl> 65, 62, 64, 67, 68, 66, 63, 64, 63, 64…
## $ measles <dbl> 1154, 492, 430, 2787, 3013, 1989, 2861…
## $ BMI <dbl> 19.1, 18.6, 18.1, 17.6, 17.2, 16.7, 16…
## $ under_5_mortality <dbl> 83, 86, 89, 93, 97, 102, 106, 110, 113…
## $ polio <dbl> 6, 58, 62, 67, 68, 66, 63, 64, 63, 58,…
## $ total_expenditure <dbl> 8.16, 8.18, 8.13, 8.52, 7.87, 9.20, 9.…
## $ diphtheria <dbl> 65, 62, 64, 67, 68, 66, 63, 64, 63, 58…
## $ `HIV/AIDS_mortality` <dbl> 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1…
## $ GDP <dbl> 584.25921, 612.69651, 631.74498, 669.9…
## $ population <dbl> 33736494, 327582, 31731688, 3696958, 2…
## $ `thinness_1-19` <dbl> 17.2, 17.5, 17.7, 17.9, 18.2, 18.4, 18…
## $ `thinness_5-9` <dbl> 17.3, 17.5, 17.7, 18.0, 18.2, 18.4, 18…
## $ income_composition_of_resources <dbl> 0.479, 0.476, 0.470, 0.463, 0.454, 0.4…
## $ schooling <dbl> 10.1, 10.0, 9.9, 9.8, 9.5, 9.2, 8.9, 8…
ggplot(data, aes(x=life_expectancy)) +
geom_density(alpha=.3, fill="red", color="red", size=1.5)+
geom_vline(aes(xintercept=mean(life_expectancy)), size=1)+
ggtitle("Distribution density of Life.expectancy") +
theme(text = element_text(size = 18))
sprintf("Skewness: [%s]", toString(skewness(data$life_expectancy, na.rm = TRUE)))
## [1] "Skewness: [-0.640978390586901]"
data$life_expectancy <- sqrt(max(data$life_expectancy+1)- data$life_expectancy)
ggplot(data, aes(x=life_expectancy)) +
geom_density(alpha=.3, fill="red", color="red", size=1.5)+
geom_vline(aes(xintercept=mean(life_expectancy)))+
ggtitle("Distribution density of Life.expectancy") +
theme(text = element_text(size = 18))
corr <- round(cor(data), 3)
ggcorrplot(corr,type = "upper", lab = TRUE, , outline.color = "black", lab_size = 1, legend.title = "Correlation")+
ggtitle("Correlation Matrix")
model_linear <- lm(life_expectancy~ ., data = data)
vifs <- data.frame(vif(model_linear))
ggplot(vifs, aes(y=vif.model_linear., x=row.names(vifs))) +
geom_bar(aes(fill=vif.model_linear.>5),stat="identity")+
scale_y_continuous(trans = "sqrt", breaks = c(5, 10, 50, 100))+
geom_hline(yintercept = 5, colour = "red") +
ggtitle("VIF per feature") +
xlab("Featurs") + ylab("VIF") +
theme(axis.text.x=element_text(angle=40, hjust=1))
data_EDA <- subset(data, select = -c(infant_mortality, `thinness_5-9`
, GDP))
model_EDA <- lm(life_expectancy~ ., data = data_EDA)
vifs_EDA <- data.frame(vif(model_EDA))
ggplot(vifs_EDA, aes(y=vif.model_EDA., x=row.names(vifs_EDA))) +
geom_bar(aes(fill=vif.model_EDA.>5),stat="identity")+
scale_y_continuous(trans = "sqrt", breaks = c(5, 10, 50, 100))+
geom_hline(yintercept = 5, colour = "red") +
ggtitle("VIF per feature") +
xlab("Featurs") + ylab("VIF") +
theme(axis.text.x=element_text(angle=40, hjust=1))
data_visu <- read.csv("data_Visu.csv")
data_visu <- na.omit(data_visu)
g <- list(
showframe = TRUE,
showcoastlines = TRUE,
projection = list(type = 'Mercator')
)
plot <- plot_geo(data_fill, locationmode="country names")%>%
add_trace(locations=~country,
z=~life_expectancy,
frame=~year) %>%
layout(
title = with(data, paste('Mean Life Expectancy')),
geo = g
)
plot
fig1<-plot_ly(data_visu,x=~schooling,y=~life_expectancy,size=~population,frame = ~year,text=~country,color = ~Continent,log_x=TRUE,size_max=60)
fig1
fig2<-plot_ly(data_visu,x=~percent_expenditure,y=~life_expectancy,size=~population,frame = ~year,text=~country,color = ~Continent,log_x=TRUE,size_max=60)
fig2
ggplot(data_visu,aes(x=GDP,y=life_expectancy))+geom_point(color="red")+geom_smooth(method="lm")+facet_wrap(~Continent)
ggplot(data_visu,aes(x=income_composition_of_resources,y=life_expectancy))+geom_point(color="blue")+geom_smooth(method="lm")+facet_wrap(~Continent)
Sử dụng data đầy đủ để tạo model với tất cả cột
set.seed(20133076)
sample_full<-sample(c(TRUE,FALSE),nrow(data),replace=TRUE, prob = c(0.7,0.3))
train_full<-data[sample_full,]
x_test_full<-data[!sample_full,]
y_test_full<-data[!sample_full,]$life_expectancy
model_full<-lm(life_expectancy~.,data=train_full)
summary(model_full)
##
## Call:
## lm(formula = life_expectancy ~ ., data = train_full)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.48480 -0.22630 0.03941 0.27688 1.89028
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.290e+00 7.865e-02 79.978 < 2e-16 ***
## adult_mortality 1.928e-03 1.108e-04 17.404 < 2e-16 ***
## infant_mortality -7.091e-03 1.181e-03 -6.003 2.28e-09 ***
## alcohol -8.177e-03 3.339e-03 -2.449 0.014403 *
## percent_expenditure -3.921e-05 1.117e-05 -3.510 0.000457 ***
## hepatitis_B 1.360e-03 5.502e-04 2.472 0.013505 *
## measles 1.390e-06 1.196e-06 1.162 0.245478
## BMI -2.836e-03 6.806e-04 -4.166 3.23e-05 ***
## under_5_mortality 5.284e-03 8.666e-04 6.098 1.28e-09 ***
## polio -2.126e-03 6.229e-04 -3.414 0.000653 ***
## total_expenditure -1.794e-02 4.623e-03 -3.881 0.000107 ***
## diphtheria -3.982e-03 6.699e-04 -5.944 3.25e-09 ***
## `HIV/AIDS_mortality` 4.225e-02 2.326e-03 18.161 < 2e-16 ***
## GDP -5.224e-06 1.702e-06 -3.068 0.002180 **
## population 2.252e-10 2.619e-10 0.860 0.390010
## `thinness_1-19` -1.107e-03 7.179e-03 -0.154 0.877462
## `thinness_5-9` 1.272e-02 7.097e-03 1.792 0.073239 .
## income_composition_of_resources -9.451e-01 8.887e-02 -10.635 < 2e-16 ***
## schooling -8.426e-02 5.975e-03 -14.101 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4738 on 2064 degrees of freedom
## Multiple R-squared: 0.8038, Adjusted R-squared: 0.8021
## F-statistic: 469.7 on 18 and 2064 DF, p-value: < 2.2e-16
Kiểm tra
pred_full<-predict(model_full, newdata = x_test_full)
value_model <- data.frame("Full", rmse(pred_full,y_test_full), summary(model_full)$adj.r.squared,length(coef(model_full))-1)
colnames(value_model) = c("Model", "Root Mean Squared Error", "R squared", "Number of variable")
value_model
## Model Root Mean Squared Error R squared Number of variable
## 1 Full 0.481023 0.8020737 18
par(mfrow=c(2,2))
plot(model_full)
pred_acct_full <- data.frame(pred_full, y_test_full)
ggplot(pred_acct_full,aes(x=pred_full, y= y_test_full)) +
geom_point() +
geom_abline(intercept=0, slope=1) +
labs(x='Predicted Values', y='Actual Values', title='Predicted vs. Actual Values')
set.seed(20133082)
sample_step<-sample(c(TRUE,FALSE),nrow(data),replace=TRUE, prob = c(0.7,0.3))
train_step<-data[sample_step,]
x_test_step<-data[!sample_step,]
y_test_step<-data[!sample_step,]$life_expectancy
model_full_step<-lm(life_expectancy~.,data=train_step)
model_step<-step(model_full_step,direction = "both",test="F")
summary(model_step)
##
## Call:
## lm(formula = life_expectancy ~ adult_mortality + infant_mortality +
## alcohol + percent_expenditure + hepatitis_B + measles + BMI +
## under_5_mortality + polio + total_expenditure + diphtheria +
## `HIV/AIDS_mortality` + GDP + `thinness_5-9` + income_composition_of_resources +
## schooling, data = train_step)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.40177 -0.22772 0.03268 0.28336 1.91790
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.147e+00 7.926e-02 77.565 < 2e-16 ***
## adult_mortality 2.030e-03 1.136e-04 17.874 < 2e-16 ***
## infant_mortality -7.433e-03 1.332e-03 -5.579 2.74e-08 ***
## alcohol -9.462e-03 3.401e-03 -2.782 0.005448 **
## percent_expenditure -3.024e-05 1.126e-05 -2.685 0.007312 **
## hepatitis_B 1.085e-03 5.700e-04 1.904 0.057102 .
## measles 1.849e-06 1.272e-06 1.453 0.146269
## BMI -3.781e-03 7.030e-04 -5.378 8.42e-08 ***
## under_5_mortality 5.577e-03 9.931e-04 5.615 2.23e-08 ***
## polio -2.596e-03 6.556e-04 -3.959 7.79e-05 ***
## total_expenditure -1.403e-02 4.691e-03 -2.992 0.002807 **
## diphtheria -3.400e-03 6.941e-04 -4.899 1.04e-06 ***
## `HIV/AIDS_mortality` 4.056e-02 2.377e-03 17.064 < 2e-16 ***
## GDP -6.255e-06 1.750e-06 -3.575 0.000359 ***
## `thinness_5-9` 1.198e-02 3.333e-03 3.595 0.000332 ***
## income_composition_of_resources -8.372e-01 8.923e-02 -9.383 < 2e-16 ***
## schooling -7.658e-02 5.937e-03 -12.899 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4764 on 2017 degrees of freedom
## Multiple R-squared: 0.7986, Adjusted R-squared: 0.797
## F-statistic: 500 on 16 and 2017 DF, p-value: < 2.2e-16
Kiểm tra
pred_step<-predict(model_step, newdata = x_test_step)
value_model[nrow(value_model) + 1,] = c("AIC",rmse(y_test_step,pred_step), summary(model_step)$adj.r.squared, length(coef(model_step))-1)
value_model
## Model Root Mean Squared Error R squared Number of variable
## 1 Full 0.481022999289545 0.802073745562233 18
## 2 AIC 0.471017271177039 0.797046965097611 16
par(mfrow=c(2,2))
plot(model_step)
pred_acct_step <- data.frame(pred_step, y_test_step)
ggplot(pred_acct_step,aes(x=pred_step, y= y_test_step)) +
geom_point() +
geom_abline(intercept=0, slope=1) +
labs(x='Predicted Values', y='Actual Values', title='Predicted vs. Actual Values')
Lựa chọn theo 3 phương pháp:
Best subset
Forward inclusion
Backward elimination
theo đó dựa trên:
Residual sum of squares (RSS)
Adjusted R^2 (Adjr2)
Mallow’s Cp (CP)
Bayesian information criterion (BIC)
set.seed(20133035)
regfit.best <- regsubsets(life_expectancy~., data= data_EDA, nvmax = 18)
reg.summary <- summary(regfit.best)
par(mfrow=c(2,2))
#- residual sum of squares:
plot(reg.summary$rss,xlab="Number of Variables",ylab="RSS",type="l")
which.min(reg.summary$rss)
## [1] 15
points(15,reg.summary$rss[15], col="red",cex=2,pch=20)
# adjusted-R^2 with its largest value
plot(reg.summary$adjr2,xlab="Number of Variables",ylab="Adjusted Rsq",type="l")
which.max(reg.summary$adjr2)
## [1] 14
points(14,reg.summary$adjr2[14], col="red",cex=2,pch=20)
# Mallow's Cp with its smallest value
plot(reg.summary$cp,xlab="Number of Variables",ylab="Cp",type='l')
which.min(reg.summary$cp)
## [1] 14
points(14,reg.summary$cp[14],col="red",cex=2,pch=20)
# BIC with its smallest value
plot(reg.summary$bic,xlab="Number of Variables",ylab="BIC",type='l')
which.min(reg.summary$bic)
## [1] 10
points(10,reg.summary$bic[15],col="red",cex=2,pch=20)
Có thể thấy BIC cho số biến ít nhất nên từ đây chỉ xét theo BIC.
par(mfrow=c(1,1))
set.seed(20133013)
regfit.fwd <- regsubsets(life_expectancy~.,data=data_EDA,nvmax=18,method="forward")
fwd.summary <-summary(regfit.fwd)
# fwd.summary
#set_plot_dimensions(8,6)
plot(fwd.summary$bic,xlab="Number of Variables",ylab="BIC",type='l')
which.min(fwd.summary$bic)
## [1] 10
points(10,fwd.summary$bic[10],col="red",cex=2,pch=20)
set.seed(20133013)
regfit.bwd <- regsubsets(life_expectancy~.,data=data_EDA,nvmax=18,method="backward")
bwd.summary <- summary(regfit.bwd)
# fwd.summary
#set_plot_dimensions(8,6)
plot(bwd.summary$bic,xlab="Number of Variables",ylab="BIC",type='l')
which.min(bwd.summary$bic)
## [1] 10
points(10,bwd.summary$bic[10],col="red",cex=2,pch=20)
Tất cả các phương pháp với BIC đều cho ra kết quả là 10 biến.
v_names <- rownames(as.data.frame(coef(regfit.best,10)))
coefs<- data.frame(v_names)
coefs$best_coef_value<- coef(regfit.best,10)
coefs$fwd_coef_value <- coef(regfit.fwd,10)
coefs$bwd_coef_value <- coef(regfit.bwd,10)
#set_plot_dimensions(18,4)
ggplot(coefs,
aes(x=v_names, y=best_coef_value, fill=best_coef_value)) +
geom_bar(stat="identity") +
ggtitle("Features & coeffecients: [method Best]") +
xlab("Feature") + ylab("Coef value") +
theme(axis.text.x=element_text(angle=45, hjust=1))
ggplot(coefs,
aes(x=v_names, y=fwd_coef_value, fill=fwd_coef_value)) +
geom_bar(stat="identity") +
ggtitle("Features & coeffecients: [method Forward inclusion]") +
xlab("Feature") + ylab("Coef value") +
theme(axis.text.x=element_text(angle=45, hjust=1))
ggplot(coefs,
aes(x=v_names, y=bwd_coef_value, fill=bwd_coef_value)) +
geom_bar(stat="identity") +
ggtitle("Feature & coeffecients: [method Backward elimination]") +
xlab("Feature") + ylab("Coef value") +
theme(axis.text.x=element_text(angle=45, hjust=1))
3 phương pháp đã chọn ra 10 biến. Tạo data:
data_select<-subset(data, select = c(life_expectancy, adult_mortality, percent_expenditure, BMI, polio, total_expenditure, diphtheria, `HIV/AIDS_mortality`, `thinness_1-19`,income_composition_of_resources, schooling ))
set.seed(20133077)
sample_select<-sample(c(TRUE, FALSE), nrow(data_select), replace=TRUE, prob=c(0.70,0.30))
train_select<-data_select[sample_select,]
x_select_test<-data_select[!sample_select,]
y_select_test<-data_select[!sample_select,]$life_expectancy
model_select<-lm(life_expectancy~.,data=train_select)
summary(model_select)
##
## Call:
## lm(formula = life_expectancy ~ ., data = train_select)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.39439 -0.23390 0.03601 0.28103 2.07112
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.302e+00 7.485e-02 84.203 < 2e-16 ***
## adult_mortality 2.000e-03 1.114e-04 17.948 < 2e-16 ***
## percent_expenditure -6.365e-05 5.901e-06 -10.787 < 2e-16 ***
## BMI -3.496e-03 6.828e-04 -5.121 3.33e-07 ***
## polio -2.464e-03 6.475e-04 -3.805 0.000146 ***
## total_expenditure -9.900e-03 4.609e-03 -2.148 0.031841 *
## diphtheria -3.181e-03 6.496e-04 -4.898 1.05e-06 ***
## `HIV/AIDS_mortality` 4.378e-02 2.382e-03 18.382 < 2e-16 ***
## `thinness_1-19` 1.278e-02 2.908e-03 4.396 1.16e-05 ***
## income_composition_of_resources -9.589e-01 8.995e-02 -10.661 < 2e-16 ***
## schooling -8.606e-02 5.841e-03 -14.733 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4808 on 2073 degrees of freedom
## Multiple R-squared: 0.7898, Adjusted R-squared: 0.7888
## F-statistic: 779.1 on 10 and 2073 DF, p-value: < 2.2e-16
Kiểm tra
pred_select<-predict(model_select, newdata = x_select_test)
value_model[nrow(value_model) + 1,] = c("BIC",rmse(y_select_test,pred_select), summary(model_select)$adj.r.squared, length(coef(model_select))-1)
value_model
## Model Root Mean Squared Error R squared Number of variable
## 1 Full 0.481022999289545 0.802073745562233 18
## 2 AIC 0.471017271177039 0.797046965097611 16
## 3 BIC 0.4841986829211 0.788832518734723 10
par(mfrow=c(2,2))
plot(model_select)
pred_acct_select <- data.frame(pred_select, y_select_test)
ggplot(pred_acct_select,aes(x=pred_select, y= y_select_test)) +
geom_point() +
geom_abline(intercept=0, slope=1) +
labs(x='Predicted Values', y='Actual Values', title='Predicted vs. Actual Values')